library(tidyverse)
library(readxl)
library(smacof)
library(ggplot2)
wyoming <- read_csv("~/Documents/Wyoming data import.csv")
AlaskaCVR <- read_excel("~/Documents/AlaskaCVR.xlsx", na = "under")
names<-colnames(wyoming)
names[6:10]<- c(1,2,3,4,5)
colnames(wyoming) <- names
wyoming_l <- wyoming %>%
select(BallotID, `1` ,`2` ,`3` ,`4`, `5` ) %>%
pivot_longer(-BallotID, names_to = 'rank', values_to = "candidate") %>%
filter(!candidate %in% c('under','over')) %>%
pivot_wider(names_from = candidate, values_from = rank, values_fn = first)
names<-colnames(AlaskaCVR)
names[7:11]<- c(1,2,3,4,5)
colnames(AlaskaCVR) <- names
AlaskaCVR$BallotID <- paste0(AlaskaCVR$`Tabulator Id`, AlaskaCVR$`Batch Id`, AlaskaCVR$`Record Id`)
alaska_l <- AlaskaCVR %>%
select(`BallotID`, `1` ,`2` ,`3` ,`4`, `5` ) %>%
pivot_longer(-BallotID, names_to = 'rank', values_to = "candidate") %>%
filter(!candidate %in% c('under','over', NA)) %>%
pivot_wider(names_from = candidate, values_from = rank, values_fn = first)
# To do: Losing observations between wyoming and wyoming_l, why?
w_matrix <- wyoming_l[,2:10]
a_matrix <- alaska_l[,2:10]
# Impute missing values
# Current Implementation
w_mat <- w_matrix %>%
mutate_all(~replace(., is.na(.), 6)) %>%
mapply(FUN=as.numeric)
principle_components <- w_mat%>%prcomp()
w_pca <-bind_cols(wyoming_l,as_tibble(principle_components$x))
a_mat <- a_matrix %>%
mutate_all(~replace(., is.na(.), 6)) %>%
mapply(FUN=as.numeric)
principle_components <- a_mat%>%prcomp()
a_pca <-bind_cols(alaska_l,as_tibble(principle_components$x))
w_cluster<-w_pca
library(broom)
# predict four clusters
w_kmeans <-kmeans(w_mat,centers = 4,# number of clusters
nstart = 100)# number of random starts
w_cluster$cluster<-w_kmeans$cluster
l_pca<-left_join(w_cluster, wyoming, by = 'BallotID' )
l_pca%>%
group_by(cluster, `1.y`) %>%
summarize(count=n()) %>%
arrange(desc(count)) %>%
spread(cluster,count)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## # A tibble: 11 x 5
## `1.y` `1` `2` `3` `4`
## <chr> <int> <int> <int> <int>
## 1 1 3799 2195 4105 NA
## 2 11 NA 235 17 41
## 3 12 113 11 7 17
## 4 3 14 16 8 9
## 5 4 NA NA 1874 1806
## 6 5 NA 131 NA 8
## 7 7 48 30 28 27
## 8 8 NA 287 NA 401
## 9 9 27 74 26 26
## 10 over 5 3 NA 3
## 11 under 6 1 1 3
## Problem, because long form files have different length vs standard files, much harder to merge.
It appears that Biden supports (1) are overwhelmingly in cluster 3, while Sanders supporters are largely in Cluster 2. Cluster 4 seems to be voters who liked both Warren but also Biden and Sanders. Cluster 1 looks to be dominated by Biden supporters who also liked some of the more eccentric also-rans like Bloomberg and Steyer.
ggplot(data = w_cluster) +
geom_point(aes(x = PC1, y =PC2, color = factor(cluster)),alpha = .4) +
theme_minimal()
a_cluster<-a_pca
library(broom)
# predict four clusters
a_kmeans <-kmeans(a_mat,centers = 4,# number of clusters
nstart = 100)# number of random starts
a_cluster$cluster<-a_kmeans$cluster
l_pca<-left_join(a_cluster, AlaskaCVR, by = "BallotID" )
l_pca%>%
group_by(cluster, `1.y`) %>%
summarize(count=n()) %>%
arrange(desc(count)) %>%
spread(cluster,count)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## # A tibble: 10 x 5
## `1.y` `1` `2` `3` `4`
## <dbl> <int> <int> <int> <int>
## 1 1 13 2317 4300 3230
## 2 3 13 12 26 13
## 3 4 2915 2427 14 2406
## 4 5 15 35 77 4
## 5 7 62 5 27 30
## 6 8 110 1121 166 2
## 7 9 11 15 47 24
## 8 11 14 93 114 26
## 9 12 41 1 16 6
## 10 NA 13 5 9 8
It appears that Biden supports (1) are overwhelmingly in cluster 3, while Sanders supporters are largely in Cluster 2. Cluster 4 seems to be voters who liked both Warren but also Biden and Sanders. Cluster 1 looks to be dominated by Biden supporters who also liked some of the more eccentric also-rans like Bloomberg and Steyer.
ggplot(data = a_cluster) +
geom_point(aes(x = PC1, y =PC2, color = factor(cluster)),alpha = .4) +
theme_minimal()
Are these clouds simmilar? Unclear.
alaska_l$state<- 'alaska'
wyoming_l$state<- 'wyoming'
b_l<- rbind(alaska_l,wyoming_l)
b_matrix <- b_l[,2:10]
b_mat <- b_matrix %>%
mutate_all(~replace(., is.na(.), 6)) %>%
mapply(FUN=as.numeric)
principle_components_b <- b_mat%>%prcomp()
b_pca <-bind_cols(b_l,as_tibble(principle_components_b$x))
ggplot(data = b_pca) +
geom_point(aes(x = PC1, y =PC2, color = state), alpha = .4) +
theme_minimal()
ggplot(data = b_pca) +
geom_density(aes(x = PC1, fill = state),alpha = .4) +
theme_minimal()
Alaska was a much better state for Sanders then Wyoming, but Sanders still lost, suggests that the middle peak in this chart is Biden-Sanders + Sanders-Biden, and that the left peak is Biden only.
b_cluster<-b_pca
# predict four clusters
b_kmeans <-kmeans(b_mat,centers = 4,# number of clusters
nstart = 100)# number of random starts
b_cluster$cluster<-b_kmeans$cluster
ggplot(data = b_cluster) +
geom_point(aes(x = PC1, y =PC2, color = factor(cluster)),alpha = .4) +
theme_minimal()
ggplot(data = b_cluster) +
geom_density(aes(x = PC1, fill = factor(cluster)),alpha = .4) +
theme_minimal()
Pretty clear that Cluster 2 is the Biden only cluster, 3 might be Biden-Sanders, 1 Sanders-Biden, 4 Sanders-Warren?
## This takes way too long, maybe a project for AWS!
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#fviz_nbclust(w_mat, FUN = kmeans, method = "wss")
# fviz_nbclust(pca_data, FUN = kmeans, method = "gap_stat") This one crashes because it runs out of memory
#fviz_nbclust(w_mat, FUN = kmeans, method = "silhouette")